import os
import re
from urllib.request import Request, urlopen, urlretrieve, urlcleanup

# Page whose HTML is fetched and scanned for article links.
BLOG_CSDN_URL = "https://blog.csdn.net/"
# Browser-like User-Agent header sent with the page request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36"}
# Network request configuration; `blog_csdn` stays bound to the Request object.
blog_csdn = Request(BLOG_CSDN_URL, headers=headers)

# Send the request. The `with` block closes the HTTP response as soon as the
# body has been read instead of leaving the connection open.
with urlopen(blog_csdn) as response:
    data = response.read()
# Bytes that are not valid UTF-8 are dropped rather than raising.
data = data.decode("utf-8", "ignore")
print(data)

"""
正则匹配
"""
# Patterns are raw strings so that `\s` reaches the regex engine as written
# instead of being an invalid string escape; `\n` still matches a newline.
# Earlier, stricter variant of the URL pattern, kept for reference:
# pat_url= '<div class="title">\n?\s*<h2>\n\s*<a href="(https://blog.csdn.net/.*?)"'
pat_url = r'<h2>\n\s*<a href="(https://blog.csdn.net/.*?)"'
pat_name = r"""<h2>\n\s*<a href="https://blog.csdn.net/.*'\s*>\s*(.*?)\s*</a>"""
filter_data_url = re.findall(pat_url, data)
filter_data_name = re.findall(pat_name, data)
print(len(filter_data_url))
print(len(filter_data_name))

# Make sure the download directory exists before writing into it.
output_dir = "../files/csdn文章"
os.makedirs(output_dir, exist_ok=True)

# Characters kept in a file name. Inside a character class `+` is a literal,
# so this matches one word character (letter, digit, underscore) or a '+'.
str_fom = r"[\w+]"
name_chars = re.compile(str_fom)  # compiled once, outside the loop

# NOTE(review): urlretrieve is called with the bare URL, so the custom
# `headers` used for the listing request are not sent here - confirm the
# site serves these downloads without them.
# zip() pairs each URL with its title and stops at the shorter list, so a
# mismatch between the two counts printed above cannot raise IndexError.
for i, (article_url, article_name) in enumerate(
        zip(filter_data_url, filter_data_name)):
    # Join the matched characters into one string; formatting the list itself
    # would produce names such as "['a', 'b'].html".
    file_name = "".join(name_chars.findall(article_name))
    if not file_name:
        # The title contained no usable characters; fall back to its position.
        file_name = f"article_{i}"
    urlretrieve(article_url, f"{output_dir}/{file_name}.html")
    print(article_name)
